home *** CD-ROM | disk | FTP | other *** search
/* copyright (c) CNIDR (see ../COPYRIGHT)

   7/29/92

   This program attempts to scan the dictionary and inverted file to
   determine the keywords that best describe a database.  These could
   then be included in the description file.

   $Log: irkeywords.c,v $
 * Revision 1.3 93/06/23 19:57:52 warnock
 * Fix from tovio@sage.ucs.uwa.edu.au for empty keywords array
 *
 * Revision 1.2 93/02/16 17:07:49 freewais
 * added AT&T patches for keyword list
 *

 */
-
- #include <string.h>
- #include <sys/types.h>
- #ifndef WIN32
- #include <sys/param.h>
- #endif
- #include "irdirent.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
- #include "panic.h"
- #include "ircfiles.h"
- #include "version.h"
- #include "irext.h"
- #include "irlex.h"
-
- #ifdef WIN32
- boolean look_up_total_word_count(database*);
- #endif
-
/* File-scope state shared by the keyword extraction code. */

FILE *logfile;

/* The dictionary header block; loaded once and then reused. */
unsigned char *dictionary_header = NULL;

/* Number of dictionary blocks, which is also the length of the
   dictionary header block. */
long number_of_blocks = 0;

/* Buffer holding one of the dictionary blocks at a time. */
unsigned char *dictionary = NULL;

/* Candidate keywords and their occurrence counts, kept as parallel
   arrays; `stored` is the number of slots currently filled and `nKeys`
   the number of keywords finally selected. */
char *keyword[100];
long keyvalue[100];
long stored;
short nKeys = 0;
-
- retreive_keywords(db)
- database* db;
- {
- long i,j, k, l, tmp, limit;
- char file[MAX_FILE_NAME_LEN + 1 ];
- #ifdef WIN32
- double x;
- #else
- double x,y;
- #endif
- char *ptr;
- int tmpval;
-
- if(NULL == dictionary_header) {
- FILE *stream = db->dictionary_stream;
- s_fclose(stream);
- db->dictionary_stream = s_fopen(dictionary_filename(file, db), "r+b");
- stream = db->dictionary_stream;
- s_fseek(stream, 0L, SEEK_SET);
- number_of_blocks = read_bytes(DICTIONARY_HEADER_SIZE,stream);
- dictionary_header=
- read_dictionary_block(dictionary_header,DICTIONARY_HEADER_SIZE,
- number_of_blocks,stream);
- if(NULL == dictionary_header) {
- printf("Could not read dictionary header block in db %s.", db->database_file);
- return(0);
- }
- }
- look_up_total_word_count(db);
- stored = 0;
- for (i=0; i<=number_of_blocks; i++) {
- FILE *stream = db->dictionary_stream;
- dictionary = read_dictionary_block(dictionary,dictionary_block_position(i, dictionary_header),
- DICTIONARY_BLOCK_SIZE, stream);
- for (j=0; j<=DICTIONARY_BLOCK_SIZE; j++)
- if (strlen(dictionary_block_word(j, dictionary))) {
- tmp = dictionary_block_word_occurances(j, dictionary);
- if (tmp == db->total_word_count)
- goto done;
- k=0;
- while (k<stored) {
- if (tmp > keyvalue[k])
- break;
- k++;
- }
- if (k == stored) {
- if (stored < 50) {
- keyvalue[stored]= tmp;
- tmpval=strlen(dictionary_block_word(j, dictionary))+1;
- keyword[stored++] = s_malloc(tmpval);
- strcpy(keyword[stored-1], dictionary_block_word(j, dictionary));
- }
- } else {
- if (stored < 50)
- stored++;
- for (l=stored-1; l>k; l--) {
- if (l==49)
- free(keyword[l]);
- keyword[l] = keyword[l-1];
- keyvalue[l] = keyvalue[l-1];
- }
- tmpval=strlen(dictionary_block_word(j, dictionary))+1;
- keyword[k] = s_malloc(tmpval);
- strcpy(keyword[k], dictionary_block_word(j, dictionary));
- keyvalue[k] = tmp;
- }
- }
- }
- done:
- /* done getting, now cull and sort */
- x = 0;
- for (i=0; i<50 && x<0.2; i++)
- x += ((double) keyvalue[i])/db->total_word_count;
- limit = i;
- /* patch from tovio@sage.ucs.uwa.edu.au to fix problem with empty
- * keywords array
- */
- if (i>0)
- for (i=0; i<=limit-1; i++) {
- for (j=i; j<limit; j++)
- if (strcmp(keyword[i],keyword[j])>0) {
- ptr=keyword[i]; keyword[i]=keyword[j]; keyword[j]=ptr;
- }
- }
- #ifdef WIN32
- nKeys = (short)limit;
- #else
- nKeys = limit;
- #endif
- }
-